{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "176f38b3-192c-4aed-b0df-dcdf97ad1831",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import (\n",
    "    decoders,\n",
    "    models,\n",
    "    normalizers,\n",
    "    pre_tokenizers,\n",
    "    processors,\n",
    "    trainers,\n",
    "    Tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ec8ea19b-355a-4db1-92b8-c8e155ffc5a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/liming/anaconda3/envs/pytorch/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text'],\n",
       "        num_rows: 2944826\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset_ori = load_dataset(\"text\", data_files=['human3.fna.line.bpe'])\n",
    "dataset_ori"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2f90376b-b937-4525-9106-9825055d5076",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 1000000\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_select = dataset_ori[\"train\"].shuffle(seed=42).select(range(1000000))\n",
    "dataset_select"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7df921fe-a764-4f3e-aae8-425884abfb18",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(models.Unigram()) #vocab (List[Tuple[str, float]], optional) – A list of vocabulary items and their relative score [(“am”, -0.2442),…]\n",
    "#models.Unigram()， 可以用一个字典初始化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ee784b0b-1e6a-432b-85d8-ec667518d8f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from tokenizers import Regex\n",
    "\n",
    "# tokenizer.normalizer = normalizers.Sequence(\n",
    "#     [\n",
    "#         normalizers.Replace(\"``\", '\"'),\n",
    "#         normalizers.Replace(\"''\", '\"'),\n",
    "#         normalizers.NFKD(),\n",
    "#         normalizers.StripAccents(),\n",
    "#         normalizers.Replace(Regex(\" {2,}\"), \" \"),\n",
    "#     ]\n",
    "#)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b9eedf6e-676b-424e-ac08-199ffecd9fad",
   "metadata": {},
   "outputs": [],
   "source": [
    "#tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()#此预标记器用提供的替换字符替换任何空格。然后它尝试在这些空间上进行分裂"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "46683a08-f4a9-41df-9192-a65ab414d586",
   "metadata": {},
   "outputs": [],
   "source": [
    "#tokenizer.pre_tokenizer.pre_tokenize_str(\"Let's test the pre-tokenizer!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "86445377-7c11-4687-bc8c-7d0bb30933df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "time cost s: 86382\n",
      "['GGGGCT', 'GGAGAAGGGG', 'AGAAGAGGAAA', 'GTGAGGT', 'TGCCT']\n"
     ]
    }
   ],
   "source": [
    "special_tokens = [\"<cls>\", \"<sep>\", \"<unk>\", \"<pad>\", \"<mask>\", \"<s>\", \"</s>\"]\n",
    "trainer = trainers.UnigramTrainer(\n",
    "    vocab_size=50000, special_tokens=special_tokens, unk_token=\"<unk>\",max_piece_length=12\n",
    ")\n",
    "def get_training_corpus(dataset_select):\n",
    "    for i in range(0, len(dataset_select), 1000):\n",
    "        yield dataset_select[i : i + 1000][\"text\"]\n",
    "\n",
    "import time\n",
    "\n",
    "start_time = time.time()\n",
    "tokenizer.train_from_iterator(get_training_corpus(dataset_select), trainer=trainer)\n",
    "end_time = time.time()\n",
    "\n",
    "print(\"time cost s:\", int(end_time-start_time))\n",
    "#tokenizer.model = models.Unigram()\n",
    "#tokenizer.train([\"human3.fna.line.bpe\"], trainer=trainer)\n",
    "\n",
    "encoding = tokenizer.encode(\"GGGGCTGGAGAAGGGGAGAAGAGGAAAGTGAGGTTGCCT\")\n",
    "print(encoding.tokens)\n",
    "tokenizer.save(\"tokenizer8.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e15eab4b-4936-4ee7-b94c-aaf3a83ebf23",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['GGGGCT', 'GGAGAAGGGG', 'AGAAGAGGAAA', 'GTGAGGT', 'TGCCT']\n"
     ]
    }
   ],
   "source": [
    "encoding = tokenizer.encode(\"GGGGCTGGAGAAGGGGAGAAGAGGAAAGTGAGGTTGCCT\")\n",
    "print(encoding.tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f7db5562-0845-46c0-90f0-61d36c4d6dd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(\"tokenizer8.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1906aa96-a00d-4797-afe2-d6b237c47750",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
